Registration number: 19BCE1717
Faculty: Prof. Parvathi R
Slot: L55 + L56
Course code: CSE3020
Consider the COVID-19 pandemic situation. Analysis and forecast the pandemic trend by using Covid19 dataset
Load necessary libraries:
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.0.2
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.2
library(devtools)
## Warning: package 'devtools' was built under R version 4.0.2
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.0.2
library(forecast)
## Warning: package 'forecast' was built under R version 4.0.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(TTR)
## Warning: package 'TTR' was built under R version 4.0.2
Load the dataset:
data <- read.csv("COVIDdata.csv")
head(data)
## iso_code continent location date total_cases new_cases
## 1 AFG Asia Afghanistan 2020-02-24 5 5
## 2 AFG Asia Afghanistan 2020-02-25 5 0
## 3 AFG Asia Afghanistan 2020-02-26 5 0
## 4 AFG Asia Afghanistan 2020-02-27 5 0
## 5 AFG Asia Afghanistan 2020-02-28 5 0
## 6 AFG Asia Afghanistan 2020-02-29 5 0
## new_cases_smoothed total_deaths new_deaths new_deaths_smoothed
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## total_cases_per_million new_cases_per_million new_cases_smoothed_per_million
## 1 0.126 0.126 NA
## 2 0.126 0.000 NA
## 3 0.126 0.000 NA
## 4 0.126 0.000 NA
## 5 0.126 0.000 NA
## 6 0.126 0.000 NA
## total_deaths_per_million new_deaths_per_million
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## new_deaths_smoothed_per_million reproduction_rate icu_patients
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## icu_patients_per_million hosp_patients hosp_patients_per_million
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## weekly_icu_admissions weekly_icu_admissions_per_million
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## weekly_hosp_admissions weekly_hosp_admissions_per_million new_tests
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## total_tests total_tests_per_thousand new_tests_per_thousand
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## new_tests_smoothed new_tests_smoothed_per_thousand positive_rate
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## tests_per_case tests_units total_vaccinations people_vaccinated
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## people_fully_vaccinated total_boosters new_vaccinations
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## new_vaccinations_smoothed total_vaccinations_per_hundred
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## people_vaccinated_per_hundred people_fully_vaccinated_per_hundred
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## total_boosters_per_hundred new_vaccinations_smoothed_per_million
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## new_people_vaccinated_smoothed new_people_vaccinated_smoothed_per_hundred
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## stringency_index population population_density median_age aged_65_older
## 1 8.33 39835428 54.422 18.6 2.581
## 2 8.33 39835428 54.422 18.6 2.581
## 3 8.33 39835428 54.422 18.6 2.581
## 4 8.33 39835428 54.422 18.6 2.581
## 5 8.33 39835428 54.422 18.6 2.581
## 6 8.33 39835428 54.422 18.6 2.581
## aged_70_older gdp_per_capita extreme_poverty cardiovasc_death_rate
## 1 1.337 1803.987 NA 597.029
## 2 1.337 1803.987 NA 597.029
## 3 1.337 1803.987 NA 597.029
## 4 1.337 1803.987 NA 597.029
## 5 1.337 1803.987 NA 597.029
## 6 1.337 1803.987 NA 597.029
## diabetes_prevalence female_smokers male_smokers handwashing_facilities
## 1 9.59 NA NA 37.746
## 2 9.59 NA NA 37.746
## 3 9.59 NA NA 37.746
## 4 9.59 NA NA 37.746
## 5 9.59 NA NA 37.746
## 6 9.59 NA NA 37.746
## hospital_beds_per_thousand life_expectancy human_development_index
## 1 0.5 64.83 0.511
## 2 0.5 64.83 0.511
## 3 0.5 64.83 0.511
## 4 0.5 64.83 0.511
## 5 0.5 64.83 0.511
## 6 0.5 64.83 0.511
## excess_mortality_cumulative_absolute excess_mortality_cumulative
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## excess_mortality excess_mortality_cumulative_per_million
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
dim(data)
## [1] 167246 67
There are over 1.6L records in this dataset. Plotting graphs will be extremely computationally intensive, thus, I have chosen to select a subset of the data to work with. For some parts I have used Indian data while for some other cases, I used UK. The reason for using two subset of data is to show trend in India and compare with other countries on the planet, and also for some attributes Indian subset of data does not have any records; however, UK does. TO show variety I have adopted this method.
Subsetting to Indian region:
data <- data[data$location == "India",]
data$date <- as.Date(data$date, format = "%Y-%m-%d")
head(data)
## iso_code continent location date total_cases new_cases
## 70066 IND Asia India 2020-01-30 1 1
## 70067 IND Asia India 2020-01-31 1 0
## 70068 IND Asia India 2020-02-01 1 0
## 70069 IND Asia India 2020-02-02 2 1
## 70070 IND Asia India 2020-02-03 3 1
## 70071 IND Asia India 2020-02-04 3 0
## new_cases_smoothed total_deaths new_deaths new_deaths_smoothed
## 70066 NA NA NA NA
## 70067 NA NA NA NA
## 70068 NA NA NA NA
## 70069 NA NA NA NA
## 70070 NA NA NA NA
## 70071 NA NA NA NA
## total_cases_per_million new_cases_per_million
## 70066 0.001 0.001
## 70067 0.001 0.000
## 70068 0.001 0.000
## 70069 0.001 0.001
## 70070 0.002 0.001
## 70071 0.002 0.000
## new_cases_smoothed_per_million total_deaths_per_million
## 70066 NA NA
## 70067 NA NA
## 70068 NA NA
## 70069 NA NA
## 70070 NA NA
## 70071 NA NA
## new_deaths_per_million new_deaths_smoothed_per_million reproduction_rate
## 70066 NA NA NA
## 70067 NA NA NA
## 70068 NA NA NA
## 70069 NA NA NA
## 70070 NA NA NA
## 70071 NA NA NA
## icu_patients icu_patients_per_million hosp_patients
## 70066 NA NA NA
## 70067 NA NA NA
## 70068 NA NA NA
## 70069 NA NA NA
## 70070 NA NA NA
## 70071 NA NA NA
## hosp_patients_per_million weekly_icu_admissions
## 70066 NA NA
## 70067 NA NA
## 70068 NA NA
## 70069 NA NA
## 70070 NA NA
## 70071 NA NA
## weekly_icu_admissions_per_million weekly_hosp_admissions
## 70066 NA NA
## 70067 NA NA
## 70068 NA NA
## 70069 NA NA
## 70070 NA NA
## 70071 NA NA
## weekly_hosp_admissions_per_million new_tests total_tests
## 70066 NA NA NA
## 70067 NA NA NA
## 70068 NA NA NA
## 70069 NA NA NA
## 70070 NA NA NA
## 70071 NA NA NA
## total_tests_per_thousand new_tests_per_thousand new_tests_smoothed
## 70066 NA NA NA
## 70067 NA NA NA
## 70068 NA NA NA
## 70069 NA NA NA
## 70070 NA NA NA
## 70071 NA NA NA
## new_tests_smoothed_per_thousand positive_rate tests_per_case tests_units
## 70066 NA NA NA
## 70067 NA NA NA
## 70068 NA NA NA
## 70069 NA NA NA
## 70070 NA NA NA
## 70071 NA NA NA
## total_vaccinations people_vaccinated people_fully_vaccinated
## 70066 NA NA NA
## 70067 NA NA NA
## 70068 NA NA NA
## 70069 NA NA NA
## 70070 NA NA NA
## 70071 NA NA NA
## total_boosters new_vaccinations new_vaccinations_smoothed
## 70066 NA NA NA
## 70067 NA NA NA
## 70068 NA NA NA
## 70069 NA NA NA
## 70070 NA NA NA
## 70071 NA NA NA
## total_vaccinations_per_hundred people_vaccinated_per_hundred
## 70066 NA NA
## 70067 NA NA
## 70068 NA NA
## 70069 NA NA
## 70070 NA NA
## 70071 NA NA
## people_fully_vaccinated_per_hundred total_boosters_per_hundred
## 70066 NA NA
## 70067 NA NA
## 70068 NA NA
## 70069 NA NA
## 70070 NA NA
## 70071 NA NA
## new_vaccinations_smoothed_per_million new_people_vaccinated_smoothed
## 70066 NA NA
## 70067 NA NA
## 70068 NA NA
## 70069 NA NA
## 70070 NA NA
## 70071 NA NA
## new_people_vaccinated_smoothed_per_hundred stringency_index population
## 70066 NA 10.19 1393409033
## 70067 NA 10.19 1393409033
## 70068 NA 10.19 1393409033
## 70069 NA 10.19 1393409033
## 70070 NA 10.19 1393409033
## 70071 NA 10.19 1393409033
## population_density median_age aged_65_older aged_70_older gdp_per_capita
## 70066 450.419 28.2 5.989 3.414 6426.674
## 70067 450.419 28.2 5.989 3.414 6426.674
## 70068 450.419 28.2 5.989 3.414 6426.674
## 70069 450.419 28.2 5.989 3.414 6426.674
## 70070 450.419 28.2 5.989 3.414 6426.674
## 70071 450.419 28.2 5.989 3.414 6426.674
## extreme_poverty cardiovasc_death_rate diabetes_prevalence female_smokers
## 70066 21.2 282.28 10.39 1.9
## 70067 21.2 282.28 10.39 1.9
## 70068 21.2 282.28 10.39 1.9
## 70069 21.2 282.28 10.39 1.9
## 70070 21.2 282.28 10.39 1.9
## 70071 21.2 282.28 10.39 1.9
## male_smokers handwashing_facilities hospital_beds_per_thousand
## 70066 20.6 59.55 0.53
## 70067 20.6 59.55 0.53
## 70068 20.6 59.55 0.53
## 70069 20.6 59.55 0.53
## 70070 20.6 59.55 0.53
## 70071 20.6 59.55 0.53
## life_expectancy human_development_index
## 70066 69.66 0.645
## 70067 69.66 0.645
## 70068 69.66 0.645
## 70069 69.66 0.645
## 70070 69.66 0.645
## 70071 69.66 0.645
## excess_mortality_cumulative_absolute excess_mortality_cumulative
## 70066 NA NA
## 70067 NA NA
## 70068 NA NA
## 70069 NA NA
## 70070 NA NA
## 70071 NA NA
## excess_mortality excess_mortality_cumulative_per_million
## 70066 NA NA
## 70067 NA NA
## 70068 NA NA
## 70069 NA NA
## 70070 NA NA
## 70071 NA NA
data$newNegativeCases <- data$new_tests - data$new_cases
library("ggplot2")
## Warning: package 'ggplot2' was built under R version 4.0.2
Now we will see plots for question 1 and 2 mentioned above. The following are plots exploring the positive cases, negative cases, hospitalisation, death rates, positive rates, etc with respect to time
The positive cases in India over time:
#data %>% drop_na(new_cases)
data <- data[complete.cases(data$new_cases),]
ggplot(data = data, aes(x = date, y = new_cases)) +geom_point() + labs(x = "Date", y = "trend", title = "New positive Cases",)
New negative cases over time in India
ggplot(data = data, aes(x = date, y = newNegativeCases)) +geom_point() + labs(x = "Date", y = "new negative cases",title = "New negative Cases",)
## Warning: Removed 76 rows containing missing values (geom_point).
Linear model:
model <- lm(data$new_cases~data$new_deaths+data$new_tests+data$positive_rate)
summary(model)
##
## Call:
## lm(formula = data$new_cases ~ data$new_deaths + data$new_tests +
## data$positive_rate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82559 -14127 2503 15743 85887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.730e+04 2.506e+03 -30.85 <2e-16 ***
## data$new_deaths -6.941e-04 1.826e+00 0.00 1
## data$new_tests 5.511e-02 2.022e-03 27.25 <2e-16 ***
## data$positive_rate 1.435e+06 2.930e+04 48.96 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26020 on 686 degrees of freedom
## (79 observations deleted due to missingness)
## Multiple R-squared: 0.9028, Adjusted R-squared: 0.9024
## F-statistic: 2124 on 3 and 686 DF, p-value: < 2.2e-16
Death rate due to COVID over time:
data$deathrate <- data$new_deaths/data$new_cases
library("ggplot2")
ggplot(data = data, aes(x = date, y = deathrate)) + geom_point() + labs(x = "Date",y = "Death Rate", title = "Death Rate due to covid",)
## Warning: Removed 46 rows containing missing values (geom_point).
Total number of cases in India over time:
newcase_ts <- ts(data$new_cases)
ggplot(data = data, aes(x = date, y = total_cases)) + geom_point() + labs(x = "Date", y = "number of cases", title = "Total number of cases wrt time")
Total number of negative cases in India over time:
ggplot(data = data, aes(x = date, y = newNegativeCases )) + geom_point() + labs(x = "Date", y = "Negative cases", title = "Total number of negative cases wrt time")
## Warning: Removed 76 rows containing missing values (geom_point).
new_cases_forecasts <- HoltWinters(newcase_ts, beta = FALSE, gamma = FALSE)
plot(new_cases_forecasts)
Now we shall see the scenario in the UK
Load the UK subset
df <- read.csv("COVIDdata.csv")
data <- df[df$location == "United Kingdom",]
data$date <- as.Date(data$date, format = "%Y-%m-%d")
New cases in UK over time
ggplot(data = data, aes(x = date, y = new_cases)) + geom_point() + labs(x = "Date", y = "new cases", title = "New cases wrt time") + geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).
Number of deaths over time: (cumulative)
ggplot(data = data, aes(x = date, y = total_deaths)) + geom_point() + labs(x = "Date", y = "Total deaths", title = "Total number of deaths wrt time") + geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 35 rows containing non-finite values (stat_smooth).
## Warning: Removed 35 rows containing missing values (geom_point).
Rate of postivity over time in the UK
ggplot(data = data, aes(x = date, y = positive_rate)) + geom_point() + labs(x = "Date", y = "rate of positive", title = "Positive rates wrt time") + geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 91 rows containing non-finite values (stat_smooth).
## Warning: Removed 91 rows containing missing values (geom_point).
Now that the visualisation for the first 2 questions are one, we shall see the next question - comparison between waves.
Here, I have assumed that the first wave is 2020, second is 2021 and third is 2022. This may not be the case in reality as the time period for each wave is much shorter. But I chose this to see the trend in the entire year. Also, it is very difficult to identify the start and end date of a wave. Many economists are not able to agree on common dates, thus, I chose to do based on the year. However, the code is same if we change the time period of the waves too.
Positive rates comparison between the thre waves of COVID-19 in the UK
first <- data
first <- first[first$date >= "2020-01-25" & first$date <= "2020-12-28", ]
second <- data
second <- second[second$date >= "2021-01-01" & second$date <= "2021-12-31", ]
third <- data
third <- third[third$date >= "2022-01-25" & third$date <= "2022-12-28", ]
ggplot(data = first, aes(x = date, y = new_cases)) + geom_point() + labs(x = "Date", y = "new cases", title = "number of new cases wrt time") + geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(data = second, aes(x = date, y = new_cases)) + geom_point() + labs(x = "Date", y = "new cases", title = "number of new cases wrt time") + geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
ggplot(data = third, aes(x = date, y = new_cases)) + geom_point() + labs(x = "Date", y = "new cases", title = "number of new cases wrt time") + geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
We see that the number of new cases rose towards the end of 2020. Then in the second wave (2021) it began to decrease, but again rose towards the end of the year. Finally in the year 2022, it seems to be decreasing so far
Positive rates comparison between the thre waves of COVID-19 in the UK
ggplot(data = first, aes(x = date, y = positive_rate)) + geom_point() + labs(x = "Date", y = "positive rates", title = "Positive rates wrt time") + geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 67 rows containing non-finite values (stat_smooth).
## Warning: Removed 67 rows containing missing values (geom_point).
ggplot(data = second, aes(x = date, y = positive_rate)) + geom_point() + labs(x = "Date", y = "Cumulative cases", title = "Positive rates wrt time") + geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 14 rows containing non-finite values (stat_smooth).
## Warning: Removed 14 rows containing missing values (geom_point).
ggplot(data = third, aes(x = date, y = positive_rate)) + geom_point() + labs(x = "Date", y = "Cumulative cases", title = "Positive rates wrt time") + geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 10 rows containing non-finite values (stat_smooth).
## Warning: Removed 10 rows containing missing values (geom_point).
We notice that the positive rate was very high in 2020 May. THen it decreased over time and then again it began to rise and in the year 2021, it was very high and ths again went on to reduce. By mid 2021, this began rising once again and in the year 2022, it is decreasing so far. This is because when the number of cases (previous graphs) rose the positivty also rose. It decreased when the other decreased.
New deaths comparison between the thre waves of COVID-19 in the UK
ggplot(data = first, aes(x = date, y = new_deaths)) + geom_point() + labs(x = "Date", y = "deaths", title = "Total number of deaths wrt time") + geom_smooth(se=F) + geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 35 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 35 rows containing non-finite values (stat_smooth).
## Warning: Removed 35 rows containing missing values (geom_point).
ggplot(data = second, aes(x = date, y = new_deaths)) + geom_point() + labs(x = "Date", y = "deaths", title = "Total number of deaths wrt time") + geom_smooth(se=F) + geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(data = third, aes(x = date, y = new_deaths)) + geom_point() + labs(x = "Date", y = "deaths", title = "Total number of deaths wrt time") + geom_smooth(se=F) + geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
This also follows the same trend as the number of new cases and positivity.
We can see the same trends being followed in India as well.
Positive rates comparison between the thre waves of COVID-19 in India
data <- read.csv("COVIDdata.csv")
data <- data[data$location == "India",]
data$date <- as.Date(data$date, format = "%Y-%m-%d")
first <- data
first <- first[first$date >= "2020-01-25" & first$date <= "2020-12-28", ]
second <- data
second <- second[second$date >= "2021-01-01" & second$date <= "2021-12-31", ]
third <- data
third <- third[third$date >= "2022-01-25" & third$date <= "2022-12-28", ]
ggplot(data = first, aes(x = date, y = new_cases)) + geom_point() + labs(x = "Date", y = "new cases", title = " number of new cases wrt time") + geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(data = second, aes(x = date, y = new_cases)) + geom_point() + labs(x = "Date", y = "new cases", title = " number of new cases wrt time") + geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(data = third, aes(x = date, y = new_cases)) + geom_point() + labs(x = "Date", y = "new cases", title = " number of new cases wrt time") + geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
The number of cases slowly increased in the second half of 2020. Then it decreased by december and january in 2021. But then we went into 2nd wave of COVID where it peaked once again in March and April. This decreased over time in the rest of the year, and this decrease is being observed in 2022 also.
Positive rates comparison between the three waves of COVID-19 in India
ggplot(data = first, aes(x = date, y = positive_rate)) + geom_point() + labs(x = "Date", y = "positive cases", title = "Positive rates wrt time") + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 50 rows containing non-finite values (stat_smooth).
## Warning: Removed 50 rows containing missing values (geom_point).
ggplot(data = second, aes(x = date, y = positive_rate)) + geom_point() + labs(x = "Date", y = "positive cases", title = "Positive rates wrt time") + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(data = third, aes(x = date, y = positive_rate)) + geom_point() + labs(x = "Date", y = "positive cases", title = "Positive rates wrt time") + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
As mentioned above, the similar trend is being followed by positive rates too
New deaths comparison between the thre waves of COVID-19 in India
ggplot(data = first, aes(x = date, y = new_deaths)) + geom_point() + labs(x = "Date", y = "Deaths", title = "Total number of Deaths wrt time") + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 42 rows containing non-finite values (stat_smooth).
## Warning: Removed 42 rows containing missing values (geom_point).
ggplot(data = second, aes(x = date, y = new_deaths)) + geom_point() + labs(x = "Date", y = "Deaths", title = "Total number of Deaths wrt time") + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
ggplot(data = third, aes(x = date, y = new_deaths)) + geom_point() + labs(x = "Date", y = "Deaths", title = "Total number of Deaths wrt time") + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
As mentioned above, the similar trend is being followed by deaths too.
Now, we will move to question 4 and 5
Forecasting:
new_cases_forecasts2 <- forecast:::forecast.HoltWinters(new_cases_forecasts, h=30)
plot(forecast(new_cases_forecasts2))
SMA:
data <- read.csv("COVIDdata.csv")
data <- data[data$location == "India",]
data$date <- as.Date(data$date, format = "%Y-%m-%d")
data <- data[complete.cases(data$new_cases),]
new_cases_rep <- ts(data$new_cases)
new_cases_sma <- SMA(new_cases_rep,n=3)
plot(new_cases_sma)
ma2 <- SMA(data$new_cases, level=.95, frequency=2)
ma2
## [1] NA NA NA NA NA NA NA NA
## [9] NA 0.3 0.2 0.2 0.2 0.1 0.0 0.0
## [17] 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## [25] 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## [33] 0.2 0.2 2.5 2.7 2.8 3.1 3.6 4.0
## [41] 5.3 5.9 6.8 7.7 7.4 8.3 8.8 10.8
## [49] 11.7 15.1 18.8 26.8 32.3 41.7 43.4 54.4
## [57] 60.8 74.5 83.1 83.0 100.7 106.7 160.2 204.4
## [65] 203.1 242.5 286.1 389.1 432.4 489.2 547.4 620.1
## [73] 644.8 666.2 788.6 840.5 873.4 865.2 904.1 980.6
## [81] 1089.0 1094.1 1163.4 1216.5 1262.4 1304.3 1396.1 1446.0
## [89] 1509.9 1560.2 1544.7 1632.4 1717.7 1832.9 1942.8 2190.7
## [97] 2311.7 2509.7 2690.0 2837.1 2974.6 3229.8 3351.1 3459.3
## [105] 3555.0 3556.0 3638.4 3766.1 3934.7 4063.3 4366.7 4486.7
## [113] 4745.8 5050.2 5336.8 5653.9 5916.6 6014.5 6238.8 6505.8
## [121] 6701.6 6979.9 7238.3 7357.6 7576.8 7828.8 8176.3 8539.1
## [129] 8853.6 9210.0 9243.7 9431.9 9599.6 9916.5 10180.2 10409.8
## [137] 10571.1 10690.7 10744.3 10946.0 11460.4 11890.2 12384.6 12774.7
## [145] 13122.2 13526.1 14068.1 14731.0 15488.8 16191.3 16778.6 17179.2
## [153] 17503.0 17935.9 18532.9 19213.2 20006.0 20701.2 21071.1 21355.8
## [161] 21897.8 22696.2 23543.5 24488.1 25271.0 25843.7 26301.6 27144.4
## [169] 28416.8 29666.7 31048.5 32440.4 33442.2 34355.6 36054.4 38135.6
## [177] 40084.3 41677.8 43178.4 44098.9 45388.8 46375.7 47940.8 50291.0
## [185] 51192.5 51558.7 51872.1 52261.9 52892.0 54700.1 55694.2 57104.7
## [193] 58032.8 57268.7 57891.5 59294.2 60544.5 61766.8 62541.6 62058.9
## [201] 61407.0 61424.3 62185.1 63715.0 64606.3 64830.3 64515.8 64140.1
## [209] 63459.5 66257.1 68481.9 69671.9 70580.8 71542.0 71546.5 72458.3
## [217] 74705.8 76942.4 79863.2 80357.7 81711.3 81645.0 82739.5 84461.8
## [225] 87124.8 89046.1 90095.0 90968.0 90705.7 90654.8 91364.0 93425.5
## [233] 93788.6 93475.6 92516.6 90267.9 89165.4 88609.1 88833.4 88357.3
## [241] 87427.9 86002.5 83727.7 82514.4 82500.4 83140.5 82753.4 81685.5
## [249] 80524.5 78115.0 76459.9 76095.3 76086.0 75366.0 74122.2 72647.0
## [257] 70233.6 69001.6 68328.2 68538.6 67554.9 65889.6 64412.2 61764.0
## [265] 59730.1 58640.8 58543.2 57529.3 55771.4 53949.1 51374.9 49577.1
## [273] 48993.0 49178.8 48601.2 47713.6 46800.1 45294.1 44906.5 45412.7
## [281] 46529.5 47175.8 46755.1 46480.6 45461.1 45192.9 45460.3 46117.2
## [289] 45960.3 45049.3 43340.3 41221.0 40515.3 40482.6 41263.5 41458.6
## [297] 41189.0 41107.0 40436.1 40763.7 42157.8 43549.7 43820.2 43443.6
## [305] 42732.6 41221.2 40360.7 39509.9 39371.9 38599.5 37751.7 36741.6
## [313] 35266.1 34293.1 33568.0 33393.5 32736.2 32206.5 31254.1 29795.4
## [321] 28832.5 27935.4 27567.7 26874.9 26385.2 25881.6 24834.1 24203.7
## [329] 23967.8 24068.0 23657.1 23129.3 22842.4 21970.4 21362.9 21111.4
## [337] 21159.3 20672.2 20018.7 19362.4 18772.6 18708.2 18740.7 18911.4
## [345] 16856.5 18361.0 17988.6 17339.1 17118.2 17162.4 17083.9 16790.9
## [353] 16270.7 15835.6 16840.6 14535.5 14428.8 14624.9 14453.7 14244.0
## [361] 14005.3 13399.7 13154.2 12942.0 13822.5 13749.1 13529.1 13218.2
## [369] 12656.1 12275.1 12244.7 12575.3 12477.7 12517.0 11814.6 11417.4
## [377] 11219.7 11368.4 11435.8 11546.2 11475.7 11399.8 11140.6 11095.7
## [385] 11200.7 11609.0 11901.6 12035.7 12524.7 12368.8 12523.6 13032.5
## [393] 13778.1 14265.9 14653.0 14884.7 14714.0 14786.5 15107.3 15732.7
## [401] 16186.9 16388.5 16590.7 16480.7 16597.6 17332.0 18431.9 19421.2
## [409] 20212.5 21157.8 21778.6 22793.5 24520.7 26954.5 29257.7 31356.9
## [417] 33723.5 35306.8 37501.0 40219.5 43682.1 47017.6 49701.9 52531.3
## [425] 54057.1 55020.5 57558.4 61633.5 65820.2 69797.5 74241.5 77713.9
## [433] 83016.1 88893.0 96468.7 105659.1 113714.0 122458.6 129719.3 138831.6
## [441] 148549.7 160586.8 172482.4 185942.9 200126.3 211504.6 225732.5 240305.7
## [449] 257424.2 273665.6 288560.8 302124.6 310957.7 320911.0 331461.6 344200.4
## [457] 354883.9 362668.3 366182.2 367235.2 370480.7 376424.7 385541.2 389556.3
## [465] 391966.0 389959.9 382754.8 378348.1 377814.8 376397.6 370792.8 360666.7
## [473] 347386.5 333632.0 320024.9 310986.5 303947.4 294835.2 282646.7 270563.8
## [481] 257596.7 247371.8 240363.0 232646.1 223291.7 212236.0 201554.3 188575.4
## [489] 177770.0 168953.9 162547.6 153708.4 144024.6 135451.8 126722.6 119426.9
## [497] 113499.8 109977.9 105132.3 99800.3 93606.0 87600.2 82376.6 79033.8
## [505] 76632.0 73447.7 69924.0 66039.8 61870.6 58872.0 57236.8 56356.4
## [513] 55003.8 53287.0 51653.8 49335.1 48107.6 47641.3 48039.0 47365.3
## [521] 46265.5 45078.4 43678.9 43048.2 43022.6 43605.3 43286.8 42558.8
## [529] 41612.5 40492.0 40071.4 40265.1 40689.7 40124.3 39650.8 39127.9
## [537] 37860.6 37911.5 38334.4 38578.0 38601.2 38402.1 38443.3 37604.3
## [545] 37854.0 38388.5 39802.2 39765.6 39810.4 40289.6 39434.8 39723.1
## [553] 40085.2 41580.6 41078.0 40634.1 39761.0 38416.5 38068.7 38174.8
## [561] 39131.9 38736.1 38046.2 36875.6 35529.4 35140.2 35230.4 36067.1
## [569] 35677.5 34652.8 33148.0 31828.0 31979.0 33301.7 35250.9 36409.0
## [577] 37277.2 37911.0 37559.4 38661.1 40863.1 42851.6 43354.1 43014.3
## [585] 42443.3 40889.6 40168.8 40204.2 40607.4 39748.5 37898.4 36088.6
## [593] 34367.2 32808.2 31970.4 32288.5 32067.2 30818.2 30346.5 29620.4
## [601] 29457.7 29924.6 30522.4 30766.4 30542.0 29705.8 28019.1 26828.8
## [609] 26156.1 26217.3 25956.3 25048.2 23989.9 22862.9 21913.6 21552.6
## [617] 21798.8 21885.8 21349.5 20490.0 19485.9 18784.0 18602.8 18454.4
## [625] 18169.2 17340.7 16574.6 15906.4 15552.1 15584.3 15731.6 15781.9
## [633] 15473.8 15218.2 14862.9 14793.4 15049.4 15178.4 15147.4 14585.0
## [641] 14257.8 13667.5 13267.2 13125.1 13155.2 12903.0 12370.0 12083.0
## [649] 11664.3 11527.9 11585.6 11794.9 11789.6 11628.2 11378.2 11171.8
## [657] 11108.9 11153.0 11251.0 11134.6 10874.3 10471.5 10044.4 9845.6
## [665] 9734.6 9903.0 9715.1 9400.6 9120.9 8789.7 8636.3 8764.0
## [673] 8927.7 8859.7 7947.8 8613.0 8463.4 8429.9 8540.9 8682.1
## [681] 8585.7 8396.9 8210.3 7928.4 8626.8 7704.1 7766.6 6922.7
## [689] 6695.3 7219.5 6953.1 6797.1 6811.6 6898.2 6918.7 6820.0
## [697] 6728.4 7364.2 7569.2 7520.2 8664.0 10309.8 12315.6 15025.6
## [705] 18044.6 23155.6 31595.3 42669.5 55948.6 70596.4 86892.3 101421.1
## [713] 118137.8 139504.5 162186.8 183260.4 201287.8 215386.7 224989.9 237323.7
## [721] 251104.6 269023.7 283322.1 291933.7 296119.9 294824.0 296295.2 299124.7
## [729] 300443.8 295700.0 287374.9 273641.3 256576.8 239362.1 225999.0 215351.0
## [737] 199554.8 181663.8 164930.5 148137.0 131845.4 117562.0 106663.8 95565.9
## [745] 82810.3 71282.2 61227.9 53542.0 48230.1 44062.4 39152.9 34441.3
## [753] 30238.7 26538.5 23561.0 21564.5 20140.2 18228.6 16180.2 14389.5
## [761] 12854.0 11612.6 10663.6 9962.7 9044.6 8177.4 7297.0 6546.4
## [769] 5976.6
fma2 <- forecast(ma2, h=30)
## Warning in ets(object, lambda = lambda, biasadj = biasadj,
## allow.multiplicative.trend = allow.multiplicative.trend, : Missing values
## encountered. Using longest contiguous portion of time series
fma2
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 770 5426.9388 3915.039 6938.838 3114.6880 7739.190
## 771 4896.7398 1564.054 8229.426 -200.1638 9993.643
## 772 4385.2993 -1115.800 9886.399 -4027.9067 12798.505
## 773 3891.9539 -4053.862 11837.770 -8260.1232 16044.031
## 774 3416.0631 -7201.486 14033.612 -12822.0767 19654.203
## 775 2957.0095 -10522.835 16436.854 -17658.6333 23572.652
## 776 2514.1974 -13990.136 19018.530 -22727.0017 27755.397
## 777 2087.0522 -17581.084 21755.188 -27992.7665 32166.871
## 778 1675.0195 -21277.303 24627.342 -33427.5280 36777.567
## 779 1277.5647 -25063.355 27618.485 -39007.3959 41562.525
## 780 894.1719 -28926.085 30714.429 -44711.9752 46500.319
## 781 524.3438 -32854.146 33902.834 -50523.6516 51572.339
## 782 167.6003 -36837.666 37172.867 -56427.0712 56762.272
## 783 -176.5216 -40867.988 40514.945 -62408.7492 62055.706
## 784 -508.4682 -44937.474 43920.538 -68456.7697 67439.833
## 785 -828.6705 -49039.355 47382.014 -74560.5493 72903.208
## 786 -1137.5439 -53167.600 50892.512 -80710.6481 78435.560
## 787 -1435.4892 -57316.825 54445.846 -86898.6166 84027.638
## 788 -1722.8931 -61482.202 58036.416 -93116.8694 89671.083
## 789 -2000.1286 -65659.398 61659.141 -99358.5801 95358.323
## 790 -2267.5554 -69844.513 65309.402 -105617.5929 101082.482
## 791 -2525.5206 -74034.030 68982.989 -111888.3477 106837.307
## 792 -2774.3588 -78224.777 72676.059 -118165.8155 112617.098
## 793 -3014.3931 -82413.889 76385.103 -124445.4430 118416.657
## 794 -3245.9348 -86598.776 80106.906 -130723.1044 124231.235
## 795 -3469.2845 -90777.096 83838.527 -136995.0597 130056.491
## 796 -3684.7321 -94946.732 87577.268 -143257.9172 135888.453
## 797 -3892.5570 -99105.771 91320.657 -149508.6012 141723.487
## 798 -4093.0290 -103252.480 95066.422 -155744.3228 147558.265
## 799 -4286.4083 -107385.299 98812.482 -161962.5547 153389.738
plot(fma2)
ARIMA:
arima_model <- auto.arima(new_cases_rep)
summary(arima_model)
## Series: new_cases_rep
## ARIMA(3,1,3)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 ma3
## 0.4607 -0.5189 0.9037 -0.2701 0.4596 -0.6868
## s.e. 0.0275 0.0332 0.0227 0.0599 0.0612 0.0310
##
## sigma^2 estimated as 55449628: log likelihood=-7934.61
## AIC=15883.23 AICc=15883.37 BIC=15915.73
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -2.633866 7412.482 4109.46 NaN Inf 0.882021 0.06448811
SMA_model <- SMA(new_cases_rep,n=3)
summary(SMA_model)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 10994 29097 56025 59169 409232 2
EMA:
EMA_model <- EMA(new_cases_rep,n=3)
summary(EMA_model)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 10815 28960 56025 58850 404008 2
Construct the model using the Simple moving average, Exponential Moving Average and ARIMA to forecast the Covid19 Dataset.
autoplot(acf(na.omit(SMA_model),plot=FALSE))+ labs(title="ACF Plot")
autoplot(pacf(na.omit(SMA_model),plot=FALSE))+ labs(title="PACF Plot")
autoplot(acf(na.omit(EMA_model),plot=FALSE))+ labs(title="ACF Plot")
autoplot(pacf(na.omit(EMA_model),plot=FALSE))+ labs(title="PACF Plot")